package com.jscloud.spark.scalacount

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Demo3 {
  def main(args: Array[String]): Unit = {
    // SparkContext is the program entry point
    val sparkConf: SparkConf = new SparkConf().setAppName("groupByKey").setMaster("local[*]")
    val sc: SparkContext = new SparkContext(sparkConf)
    sc.setLogLevel("WARN")

    // Build a pair RDD of (word, 1) from a small in-memory word list
    val rdd1: RDD[String] = sc.parallelize(Array("hello", "hadoop", "hello", "spark"), 1)
    val rdd2: RDD[(String, Int)] = rdd1.map((_, 1))
    // groupByKey gathers all values for each key into a single Iterable
    val result: RDD[(String, Iterable[Int])] = rdd2.groupByKey()

    // Print each grouped pair, e.g. (hello,CompactBuffer(1, 1))
    result.foreach(println)

    // Blank lines to separate the two outputs
    println()
    println()

    // Collapse each group to its size, yielding (word, count) pairs
    result.mapValues(_.size).foreach(x => print(x + "\t"))
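
    println()

    // Alternative sketch (not part of the original demo): for a plain word
    // count, reduceByKey is generally preferred over groupByKey because it
    // combines values map-side before the shuffle instead of moving every
    // (word, 1) pair across the network. The val name `counts` is illustrative.
    val counts: RDD[(String, Int)] = rdd2.reduceByKey(_ + _)
    counts.foreach(x => print(x + "\t"))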
    sc.stop()

  }

}
